Initialization

In [1]:
rm(list = ls())

library(data.table)
library(tidyverse)
library(rJava)
library(RNetLogo)

library(lhs)  # For maximin Latin hypercube sampling
library(ggplot2)
library(plotly)  # For beautiful plotting
library(caret)
library(randomForest)
library(factoextra)
library(e1071)
library(TSrepr)  # for evaluating predictive power

require(gridExtra)

options(warn = -1)
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.2.1     v purrr   0.3.3
v tibble  2.1.3     v dplyr   0.8.3
v tidyr   1.0.0     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::between()   masks data.table::between()
x dplyr::filter()    masks stats::filter()
x dplyr::first()     masks data.table::first()
x purrr::flatten()   masks jsonlite::flatten()
x dplyr::lag()       masks stats::lag()
x dplyr::last()      masks data.table::last()
x purrr::transpose() masks data.table::transpose()
Loading required package: igraph

Attaching package: 'igraph'

The following objects are masked from 'package:dplyr':

    as_data_frame, groups, union

The following objects are masked from 'package:purrr':

    compose, simplify

The following object is masked from 'package:tidyr':

    crossing

The following object is masked from 'package:tibble':

    as_data_frame

The following objects are masked from 'package:stats':

    decompose, spectrum

The following object is masked from 'package:base':

    union


Attaching package: 'plotly'

The following object is masked from 'package:igraph':

    groups

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:dplyr':

    combine

The following object is masked from 'package:ggplot2':

    margin

Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
Loading required package: gridExtra

Attaching package: 'gridExtra'

The following object is masked from 'package:randomForest':

    combine

The following object is masked from 'package:dplyr':

    combine

In [2]:
folder.path = "C:/Users/paslanpatir/Desktop/TEZ_v2/"
data.path = paste0(folder.path,"data/")

nl.model.1 = "Segregation"
model.type.1 = ifelse(nl.model.1 == "Segregation", "basic", "dummy")
output.folder.1 = paste0("outputs/outputs_50_100/outputs_WRep_Basic")
outputs.path.1 = paste0(folder.path,output.folder.1,"/")

nl.model.2 = ifelse(nl.model.1 == "Segregation", "Segregation_Dummy", "Segregation")
model.type.2 = ifelse(model.type.1 == "basic", "dummy", "basic")
output.folder.2 = paste0("outputs/outputs_50_100/outputs_WRep_Dummy")
outputs.path.2 = paste0(folder.path,output.folder.2,"/")

Unlabeled Pool

In [3]:
unlabeled_ins = 100
In [4]:
unlabeled_pool.name.1= paste0(data.path,"unlabeled_pool","_",model.type.1,"_",unlabeled_ins,".csv")
unlabeled_pool.name.1 <- fread(unlabeled_pool.name.1)  

data_candidates.1 = copy(unlabeled_pool.name.1)

unlabeled_pool.name.2= paste0(data.path,"unlabeled_pool","_",model.type.2,"_",unlabeled_ins,".csv")
unlabeled_pool.name.2 <- fread(unlabeled_pool.name.2)  

data_candidates.2 = copy(unlabeled_pool.name.2)
In [5]:
data_candidates_bind = rbind(data.table(data_candidates.1[,.(density,`%-similar-wanted`)], model = model.type.1)
                            ,data.table(data_candidates.2[,.(density,`%-similar-wanted`)], model = model.type.2))
data_candidates_plot = ggplot(data = data_candidates_bind, aes(x = density, y = `%-similar-wanted`)) +
                        geom_point(aes(color = model)) +
                        facet_wrap(~model) +
                        ggtitle(paste0("unlabeled data for models"))
data_candidates_plot
#ggsave(paste0(outputs.path,"unlabeled_", model.type,".png"))

Test Set

In [6]:
test.seed.1 = c(0,1,2)
test.seed.2 = c(0,2)

test_ins = 100
In [7]:
test_set_all.1 = data.table()
for( i in test.seed.1){
    
    test_set.name.1= paste0(data.path,"test_set","_",model.type.1,"_",test_ins,"_seed",i,".csv")
    test_set.1 <- fread(test_set.name.1) 
    
    test_set_all.1 = rbind(test_set_all.1,data.table(test_set.1, "seed" = i))
    rm(test_set.1,test_set.name.1)    
}
test_set_all.2 = data.table()
for( i in test.seed.2){
    
    test_set.name.2= paste0(data.path,"test_set","_",model.type.2,"_",test_ins,"_seed",i,".csv")
    test_set.2 <- fread(test_set.name.2) 
    
    test_set_all.2 = rbind(test_set_all.2,data.table(test_set.2, "seed" = i))
    rm(test_set.2,test_set.name.2)    
}
In [8]:
test_set.name.1= paste0(data.path,"test_set","_",model.type.1,"_",test_ins,".csv")
test_set.1 <- fread(test_set.name.1)  

test_set.name.2= paste0(data.path,"test_set","_",model.type.2,"_",test_ins,".csv")
test_set.2 <- fread(test_set.name.2)
In [9]:
test_set_bind = rbind(data.table(test_set_all.1[,.(density,`%-similar-wanted`,seed)], model = model.type.1)
                     ,data.table(test_set_all.2[,.(density,`%-similar-wanted`,seed)], model = model.type.2))

test_set_plot = ggplot(data = test_set_bind, aes(x = density, y = `%-similar-wanted`)) +
                    geom_point(aes(color = model)) +
                    facet_grid(seed~model) +
                    ggtitle(paste0("test data for models"))
test_set_plot

Aynı seedler için basic'te de dummy'de de iki important feature'ın dağılımı aynı.

Oneshot Data

In [10]:
seed.oneshot.1 = c(0,1,2,3,4,5,6,7,8,20)
seed.oneshot.2 = c(0,1,2,3,4,5,6,7,8,20)
train_ins_oneshot = 100
In [11]:
training_set_all.1 = data.table()
for( i in seed.oneshot.1){
    
    training_set.name= paste0(data.path,"training_set","_",model.type.1,"_",train_ins_oneshot,"_seed",i,".csv")
    training_set <- fread(training_set.name) 

    training_set_all.1 = rbind(training_set_all.1,data.table(training_set, "seed" = i))
    rm(training_set,training_set.name)    
}

training_set_all.2 = data.table()
for( i in seed.oneshot.2){
    
    training_set.name= paste0(data.path,"training_set","_",model.type.2,"_",train_ins_oneshot,"_seed",i,".csv")
    training_set <- fread(training_set.name) 

    training_set_all.2 = rbind(training_set_all.2,data.table(training_set, "seed" = i))
    rm(training_set,training_set.name)    
}
In [12]:
one_shot_data.1 = copy(training_set_all.1)
one_shot_data.2 = copy(training_set_all.2)
In [13]:
one_shot_bind = rbind(data.table(one_shot_data.1[,.(density,`%-similar-wanted`,seed,output)], model = model.type.1)
                     ,data.table(one_shot_data.2[,.(density,`%-similar-wanted`,seed,output)], model = model.type.2))
one_shot_plot = ggplot(data = one_shot_bind, aes(x = density, y = `%-similar-wanted`)) +
                    geom_point(aes(colour = output)) +
                    facet_grid(seed~model) +
                    labs(legend = "output") +
                    ggtitle(paste0("one_shot_data for models "))
one_shot_plot
In [14]:
# Eğer üst üste koyarsak, 10 tane seed'in toplamda cover edemediği alanları rahatlıkla görebiliriz.
ggplot(data = one_shot_bind, aes(x = density, y = `%-similar-wanted`)) +
                    geom_point(aes(colour = output)) +
                    facet_grid(~model) +
                    labs(legend = "output") +
                    ggtitle(paste0("one_shot_data for models "))

Adaptive Initial Data

In [15]:
seed.Ad.1 = c(0,1,2,3,4,5,6,7,8,20)
seed.Ad.2 = c(0,1,2,3,4,5,6,7,8,20)
train_ins_Ad = 50
In [16]:
training_set_Ad_all.1 = data.table()
for (i in seed.Ad.1) {
    
    training_set.name = paste0(data.path, "training_set", "_", model.type.1, "_", train_ins_Ad, "_seed", i, ".csv")
    training_set <- fread(training_set.name)
    
    training_set_Ad_all.1 = rbind(training_set_Ad_all.1, data.table(training_set, seed = i))
    rm(training_set, training_set.name)
}

training_set_Ad_all.2 = data.table()
for (i in seed.Ad.2) {
    
    training_set.name = paste0(data.path, "training_set", "_", model.type.2, "_", train_ins_Ad, "_seed", i, ".csv")
    training_set <- fread(training_set.name)
    
    training_set_Ad_all.2 = rbind(training_set_Ad_all.2, data.table(training_set, seed = i))
    rm(training_set, training_set.name)
}
In [17]:
adaptive_initial_data.1 = copy(training_set_Ad_all.1)
adaptive_initial_data.2 = copy(training_set_Ad_all.2)
In [18]:
adaptive_initial_bind = rbind(data.table(adaptive_initial_data.1[,.(density,`%-similar-wanted`,seed,output)], model = model.type.1)
                             ,data.table(adaptive_initial_data.2[,.(density,`%-similar-wanted`,seed,output)], model = model.type.2))

adaptive_initial_plot = ggplot(data = adaptive_initial_bind, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~model) +
                           labs(legend = "output") +
                           ggtitle(paste0("initial_adaptive_data for models "))
adaptive_initial_plot
In [19]:
# Eğer üst üste koyarsak, 10 tane seed'in toplamda cover edemediği alanları rahatlıkla görebiliriz.
ggplot(data = adaptive_initial_bind, aes(x = density, y = `%-similar-wanted`)) +
                    geom_point(aes(colour = output)) +
                    facet_grid(~model) +
                    labs(legend = "output") +
                    ggtitle(paste0("overlapped initial adaptive data for models"))

OneShot

In [20]:
#### OneShot ####

Model.1

In [21]:
#### Model.1 ####
one_shot_path.1 = paste0(outputs.path.1,"oneshot/")

obb_error_oneshot.1 = fread(paste0(one_shot_path.1,model.type.1,"_obb_error_oneshot_all",".csv"))
performance_table_oneshot.1 = fread(paste0(one_shot_path.1,model.type.1,"_performance_table_oneshot",".csv"))
predictedLabels_oneshot.1 = fread(paste0(one_shot_path.1,model.type.1,"_predictedLabels_oneshot_all",".csv"))
In [22]:
performance_molten_oneshot.1 <- melt(data = performance_table_oneshot.1
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_oneshot.1, c("variable","value"),c("errortype","errorvalue"))

oob error

In [23]:
#### oob error ####
oob_oneshot_plot.1 = ggplot(obb_error_oneshot.1, aes(x=as.factor(rep), y = obb_error)) + 
                        geom_point(aes(colour = as.factor(rep))) +
                #geom_hline( aes(yintercept = obb_error, color = obb_error )) + 
                       # facet_grid(rep ~ seed) +
                        facet_wrap(~ seed) +
                        ggtitle(paste0("oob error with oneshot sampling for ",model.type.1)) 
oob_oneshot_plot.1

test error

In [24]:
#### test error ####
one_shot_ind_p.1 = ggplot(performance_molten_oneshot.1, aes(x=as.factor(rep),y = errorvalue, group=errortype, col=errortype)) + 
                            geom_point() + 
                            facet_wrap( ~ seed) +
                            geom_hline(yintercept = mean(performance_molten_oneshot.1[errortype=="rmse"]$errorvalue), color = "green") +
                            ggtitle(paste0("test error with Oneshot Sampling for ",model.type.1)) 
one_shot_ind_p.1
In [25]:
bxp.oneshot.1 <-  boxplot( data = performance_molten_oneshot.1[errortype == "rmse"], errorvalue  ~ seed, col = "green")
bxp.oneshot.1
$stats
A matrix: 5 × 10 of type dbl
4.8375714.4380663.1357314.8357353.9872764.1706623.5234084.7046714.0620945.637496
4.9197224.5540953.2144874.9243554.1585854.2443463.6203144.7558654.1209855.652675
5.0220524.6587823.2373915.0541114.2379884.2967533.6557554.8095824.1873095.709888
5.1131754.7357813.2811635.1482114.3195514.4330533.7790714.8721354.2422145.761358
5.2702744.8593543.3629315.1646764.4437094.4950013.8634124.9172254.3630535.833708
$n
  1. 10
  2. 10
  3. 10
  4. 10
  5. 10
  6. 10
  7. 10
  8. 10
  9. 10
  10. 10
$conf
A matrix: 2 × 10 of type dbl
4.9253954.5680053.2040774.9422634.1575634.2024683.5764344.7514894.1267385.655585
5.1187094.7495603.2707055.1659584.3184134.3910393.7350764.8676754.2478795.764191
$out
  1. 3.353070703
  2. 4.424964074
$group
  1. 7
  2. 9
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [26]:
grid.arrange(oob_oneshot_plot.1,one_shot_ind_p.1, ncol = 2)

Seed 20 : train datasını iyi öğrenmiş ama test datasında zayıfsız kalmış Seed 8 : 20'nin tam tersi Seed 6 : trainde dalgalanma geniş ama test için iyi denebilir.

Actual vs Fitted

In [27]:
#### Actual vs Fitted ####
slct_seed= 6 
a_vs_f_oneshot.1 <- ggplot(predictedLabels_oneshot.1[seed == slct_seed]
                           ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap(~ rep) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for OneShot Sampling","\n"," seed:",slct_seed,"\n","for ",model.type.1)) 

a_vs_f_oneshot.1

Replications on Each Seed

In [28]:
#### Replications on Each Seed ####

one_shot_ind_bxp.1 = ggplot(data = performance_molten_oneshot.1, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot() +
                    facet_wrap(~ seed,scales = "free") +
                    ggtitle(paste0(" test performances","\n","with Oneshot Sampling for model_",model.type.1)) 
one_shot_ind_bxp.1
In [29]:
performance_table_oneshot.1[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<int><dbl>
23.245273
63.661898
84.210791
44.244389
54.323117
14.656075
74.812622
35.032114
05.032954
205.718107
In [30]:
one_shot_ind_pbxp.1 = ggplot(data = performance_table_oneshot.1, aes(y =rmse, x = rep )) +
                    geom_point(aes(colour = as.factor(rep))) +
                    geom_boxplot() +
                    facet_wrap(~ seed) +
                    labs(legend = "rmse") 
one_shot_ind_pbxp.1

Overall BoxPlot

In [31]:
one_shot_bxp.1 = ggplot(data = performance_molten_oneshot.1, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_oneshot.1$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Oneshot Sampling for ",model.type.1))
                    
one_shot_bxp.1
In [32]:
boxplot.stats(performance_table_oneshot.1$rmse)
$stats
  1. 3.135730619
  2. 4.150533966
  3. 4.493373706
  4. 4.922242515
  5. 5.8337077
$n
100
$conf
  1. 4.371443755258
  2. 4.615303656742
$out

Model.2

In [33]:
#### Model.2 ####
one_shot_path.2 = paste0(outputs.path.2,"oneshot/")

obb_error_oneshot.2 = fread(paste0(one_shot_path.2,model.type.2,"_obb_error_oneshot_all",".csv"))
performance_table_oneshot.2 = fread(paste0(one_shot_path.2,model.type.2,"_performance_table_oneshot",".csv"))
predictedLabels_oneshot.2 = fread(paste0(one_shot_path.2,model.type.2,"_predictedLabels_oneshot_all",".csv"))
In [34]:
performance_molten_oneshot.2 <- melt(data = performance_table_oneshot.2
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_oneshot.2, c("variable","value"),c("errortype","errorvalue"))

oob error

In [35]:
#### oob error ####
oob_oneshot_plot.2 = ggplot(obb_error_oneshot.2, aes(x=as.factor(rep), y = obb_error)) + 
                        geom_point(aes(colour = as.factor(rep))) +
                #geom_hline( aes(yintercept = obb_error, color = obb_error )) + 
                       # facet_grid(rep ~ seed) +
                        facet_wrap(~ seed) +
                        ggtitle(paste0("oob error with oneshot sampling for ",model.type.2)) 
oob_oneshot_plot.2

test error

In [36]:
#### test error ####
one_shot_ind_p.2 = ggplot(performance_molten_oneshot.2, aes(x=as.factor(rep),y = errorvalue, group=errortype, col=errortype)) + 
                            geom_point() + 
                            facet_wrap( ~ seed) +
                            geom_hline(yintercept = mean(performance_molten_oneshot.2[errortype=="rmse"]$errorvalue), color = "green") +   
                            ggtitle(paste0("test error with Oneshot Sampling for ",model.type.2)) 
one_shot_ind_p.2
In [37]:
bxp.oneshot.2 <-  boxplot( data = performance_molten_oneshot.2[errortype == "rmse"], errorvalue  ~ seed, col = "green")
bxp.oneshot.2
$stats
A matrix: 5 × 10 of type dbl
8.4702268.49048610.29345 9.6129358.4950838.523314 9.574276 9.1780078.7214419.128286
8.6266508.69302410.37989 9.6931028.5472388.854202 9.640304 9.4099958.9116719.254552
8.8100808.74801110.56635 9.7952548.6015969.066436 9.788990 9.6410899.0497689.333117
8.8431998.86585310.65648 9.9807458.7607899.127384 9.953325 9.7372379.2509089.405477
9.0274919.12027610.7993710.2362098.8841589.36801910.21965110.0137549.3694829.492823
$n
  1. 10
  2. 10
  3. 10
  4. 10
  5. 10
  6. 10
  7. 10
  8. 10
  9. 10
  10. 10
$conf
A matrix: 2 × 10 of type dbl
8.7018848.66165810.428169.6515368.4948988.9299449.6325929.4775868.8802719.257709
8.9182778.83436310.704549.9389718.7082959.2029289.9453879.8045929.2192649.408525
$out
10.44226244
$group
4
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [38]:
grid.arrange(oob_oneshot_plot.2,one_shot_ind_p.2, ncol = 2)

Actual vs Fitted

In [39]:
#### Actual vs Fitted ####
slct_seed= 2 
a_vs_f_oneshot.2 <- ggplot(predictedLabels_oneshot.2[seed == slct_seed]
                           ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap(~ rep) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for OneShot Sampling","\n"," seed:",slct_seed,"\n","for ",model.type.2)) 

a_vs_f_oneshot.2

Replications on Each Seed

In [40]:
#### Replications on Each Seed ####
one_shot_ind_bxp.2 = ggplot(data = performance_molten_oneshot.2, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot() +
                    facet_wrap(~ seed, scales = "free") +
                    ggtitle(paste0(" test performances","\n","with Oneshot Sampling for model_",model.type.2)) 
one_shot_ind_bxp.2
In [41]:
performance_table_oneshot.2[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<int><dbl>
4 8.652122
0 8.741549
1 8.785981
5 9.006951
8 9.064582
20 9.319068
7 9.617516
6 9.822358
3 9.883133
210.539543
In [42]:
one_shot_ind_pbxp.2 = ggplot(data = performance_table_oneshot.2, aes(y =rmse, x = rep )) +
                    geom_point(aes(colour = as.factor(rep))) +
                    geom_boxplot() +
                    facet_wrap(~ seed) +
                    labs(legend = "rmse") 
one_shot_ind_pbxp.2

Overall BoxPlot

In [43]:
one_shot_bxp.2 = ggplot(data = performance_molten_oneshot.2, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_oneshot.2$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Oneshot Sampling for ",model.type.2))
                    
one_shot_bxp.2
In [44]:
boxplot.stats(performance_table_oneshot.2$rmse)
$stats
  1. 8.470226185
  2. 8.8130155215
  3. 9.252729903
  4. 9.7328418525
  5. 10.79937472
$n
100
$conf
  1. 9.107397342702
  2. 9.398062463298
$out

Model.1 vs Model.2

In [45]:
cbind( performance_table_oneshot.1[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
      ,performance_table_oneshot.2[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)])
A data.table: 10 × 4
seedmean_rmseseedmean_rmse
<int><dbl><int><dbl>
23.245273 4 8.652122
63.661898 0 8.741549
84.210791 1 8.785981
44.244389 5 9.006951
54.323117 8 9.064582
14.65607520 9.319068
74.812622 7 9.617516
35.032114 6 9.822358
05.032954 3 9.883133
205.718107 210.539543

Random Sampling Replications

In [46]:
#### Random Sampling Replications ####

Model.1

In [47]:
#### Model.1 ####
Rd_path.1 = paste0(outputs.path.1,"Rd/")

obb_error_Rd.1              = fread(paste0(Rd_path.1,model.type.1,"_obb_error_Rd",".csv"))
performance_table_Rd.1      = fread(paste0(Rd_path.1,model.type.1,"_performance_table_Rd",".csv"))
predictedLabels_Rd.1        = fread(paste0(Rd_path.1,model.type.1,"_predictedLabels_table_Rd",".csv"))
FinalTrainData_Rd.1         = fread(paste0(Rd_path.1,model.type.1,"_FinalTrainData_Rd",".csv"))
train_candidates_Rd.1       = fread(paste0(Rd_path.1,model.type.1,"_train_candidates_table_Rd",".csv"))
In [48]:
performance_molten_Rd_seq.1 <- melt(data = performance_table_Rd.1
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_Rd_seq.1, c("variable","value"),c("errortype","errorvalue"))
In [49]:
predictedLabels_molten_Rd_seq.1 <- melt(data = predictedLabels_Rd.1
                             , id.vars = c("density","%-similar-wanted",'output',"seed","rep")
                             , measure.vars = c("pred_output_1","pred_output_2","pred_output_3","pred_output_4","pred_output_5","pred_output_6","pred_output_7","pred_output_8","pred_output_9","pred_output_10","pred_output_11"))
setnames(predictedLabels_molten_Rd_seq.1, c("variable","value"),c("iter","pred_output"))
predictedLabels_molten_Rd_seq.1[,iter := as.numeric(str_sub(iter, 13))]

oob error

In [50]:
#### oob error ####
oob_Rd_seq_plot.1 = ggplot(obb_error_Rd.1, aes(x=iter, y = obb_error)) + 
                geom_line( aes(color = as.factor(rep))) + 
                facet_grid(rep ~ seed) +
                ggtitle(paste0("sequential oob error with Rd Sampling for ",model.type.1)) 
oob_Rd_seq_plot.1

test error

In [51]:
#### test error ####
test_error_Rd_seq_plot.1 = ggplot(performance_molten_Rd_seq.1, aes(x=iter,y = errorvalue, group=errortype, col=errortype)) + 
                            geom_line() + 
                            facet_grid(rep ~ seed) +
                            ggtitle(paste0("sequential test error with Rd Sampling for ",model.type.1)) 
test_error_Rd_seq_plot.1

Actual vs Fitted

In [52]:
#### Actual vs Fitted ####
# The last iteration(pred_output_11) on the 10th replication
slct_rep = 10
slct_seed= 2
a_vs_f_Rd_seq.1 <- ggplot(predictedLabels_molten_Rd_seq.1[rep == slct_rep & seed == slct_seed]
                 ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap( ~ iter) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for Rd Sampling","\n","rep:",slct_rep," and seed:",slct_seed,"\n","for ",model.type.1)) 

a_vs_f_Rd_seq.1

Replications on Each Iteration

In [53]:
#### Replications on Each Iteration ####
# Observe the boxplots in each iterations
Rd_seq_ind_bxp.1 = ggplot(performance_molten_Rd_seq.1, aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_wrap(~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Rd Sampling for model_",model.type.1))
Rd_seq_ind_bxp.1
In [54]:
Rd_seq_ind_bxp_seed.1 = ggplot(performance_molten_Rd_seq.1
                               , aes(y = errorvalue, group = errortype, col = errortype)) + 
                            geom_boxplot() + 
                            facet_grid(seed ~ iter ) + 
                            ggtitle(paste0("sequential test performances by Seeds","\n","with Rd Sampling for ",model.type.1))
Rd_seq_ind_bxp_seed.1

Overall BoxPlot

In [55]:
#### Overall BoxPlot ####
Rd_seq_bxp.1 = ggplot(data = performance_molten_Rd_seq.1[iter == 11], aes( y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                    labs( caption = paste0("final meanRMSE : ",mean(performance_table_Rd.1[iter == 11]$rmse))) +
                    ggtitle(paste0("final iteration test performace","\n"," with Rd Sampling for ",model.type.1))
                    
Rd_seq_bxp.1
In [56]:
boxplot.stats(performance_table_Rd.1[iter == 11]$rmse)
$stats
  1. 3.292855356
  2. 4.3594440445
  3. 4.877001925
  4. 6.196094353
  5. 8.614730941
$n
100
$conf
  1. 4.586811176257
  2. 5.167192673743
$out
9.293791462

Train Candidates

In [57]:
#### Train Candidates ####
tc_Rd_seq_plot.1 = ggplot(adaptive_initial_data.1, aes(x = density, y = `%-similar-wanted`)) +
               geom_point(color = "grey") +
               geom_point(data = train_candidates_Rd.1[rep == 10], aes(colour = as.factor(iter))) +
               facet_wrap(~ seed) + 
               labs(legend = "output") +
               ggtitle(paste0("Train Candidates for each Seed","\n","with Rd Sampling for ",model.type.1))
tc_Rd_seq_plot.1

Final Data

In [58]:
#### Final Data ####
final_train_data_Rd.1 = ggplot(data = FinalTrainData_Rd.1, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~rep) +
                            #facet_warp(~rep) +
                            #facet_warp(~seed) +
                           labs(legend = "output") +
                           ggtitle(paste0("final Rd data for model_ ",model.type.1))
final_train_data_Rd.1

Model.2

In [59]:
#### Model.2 ####
Rd_path.2 = paste0(outputs.path.2,"Rd/")

obb_error_Rd.2              = fread(paste0(Rd_path.2,model.type.2,"_obb_error_Rd",".csv"))
performance_table_Rd.2      = fread(paste0(Rd_path.2,model.type.2,"_performance_table_Rd",".csv"))
predictedLabels_Rd.2        = fread(paste0(Rd_path.2,model.type.2,"_predictedLabels_table_Rd",".csv"))
FinalTrainData_Rd.2         = fread(paste0(Rd_path.2,model.type.2,"_FinalTrainData_Rd",".csv"))
train_candidates_Rd.2       = fread(paste0(Rd_path.2,model.type.2,"_train_candidates_table_Rd",".csv"))
In [60]:
performance_molten_Rd_seq.2 <- melt(data = performance_table_Rd.2
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_Rd_seq.2, c("variable","value"),c("errortype","errorvalue"))

predictedLabels_molten_Rd_seq.2 <- melt(data = predictedLabels_Rd.2
                             , id.vars = c("density","%-similar-wanted",'output',"seed","rep")
                             , measure.vars = c("pred_output_1","pred_output_2","pred_output_3","pred_output_4","pred_output_5","pred_output_6","pred_output_7","pred_output_8","pred_output_9","pred_output_10","pred_output_11"))
setnames(predictedLabels_molten_Rd_seq.2, c("variable","value"),c("iter","pred_output"))
predictedLabels_molten_Rd_seq.2[,iter := as.numeric(str_sub(iter, 13))]

oob error

In [61]:
#### oob error ####
oob_Rd_seq_plot.2 = ggplot(obb_error_Rd.2, aes(x=iter, y = obb_error)) + 
                geom_line( aes(color = as.factor(rep))) + 
                facet_grid(rep ~ seed) +
                ggtitle(paste0("sequential oob error with Rd Sampling for ",model.type.2)) 
oob_Rd_seq_plot.2

test error

In [62]:
#### test error ####
test_error_Rd_seq_plot.2 = ggplot(performance_molten_Rd_seq.2, aes(x=iter,y = errorvalue, group=errortype, col=errortype)) + 
                            geom_line() + 
                            facet_grid(rep ~ seed) +
                            ggtitle(paste0("sequential test error with Rd Sampling for ",model.type.2)) 
test_error_Rd_seq_plot.2

Actual vs Fitted

In [63]:
#### Actual vs Fitted ####
slct_rep = 10
slct_seed= 2
a_vs_f_Rd_seq.2 <- ggplot(predictedLabels_molten_Rd_seq.2[rep == slct_rep & seed == slct_seed]
                 ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap( ~ iter) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for Rd Sampling","\n","rep:",slct_rep," and seed:",slct_seed,"\n","for ",model.type.2)) 

a_vs_f_Rd_seq.2

Replications on Each Iteration

In [64]:
#### Replications on Each Iteration ####
# Observe the boxplots in each iterations
Rd_seq_ind_bxp.2 = ggplot(performance_molten_Rd_seq.2, aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_wrap(~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Rd Sampling for model_",model.type.2))
Rd_seq_ind_bxp.2
In [65]:
Rd_seq_ind_bxp_seed.2 = ggplot(performance_molten_Rd_seq.2
                               , aes(y = errorvalue, group = errortype, col = errortype)) + 
                            geom_boxplot() + 
                            facet_grid(seed ~ iter ) + 
                            ggtitle(paste0("sequential test performances by Seeds","\n","with Rd Sampling for ",model.type.2))
Rd_seq_ind_bxp_seed.2

Overall BoxPlot

In [66]:
#### Overall BoxPlot ####
Rd_seq_bxp.2 = ggplot(data = performance_molten_Rd_seq.2[iter == 11], aes( y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                    labs( caption = paste0("final meanRMSE : ",mean(performance_table_Rd.2[iter == 11]$rmse))) +
                    ggtitle(paste0("final iteration test performace","\n"," with Rd Sampling for ",model.type.2))
                    
Rd_seq_bxp.2
In [67]:
boxplot.stats(performance_table_Rd.2[iter == 11]$rmse)
$stats
  1. 8.430070319
  2. 9.1561261655
  3. 9.5123156825
  4. 9.908136319
  5. 10.95774338
$n
100
$conf
  1. 9.393498078247
  2. 9.631133286753
$out
12.22456254

Train Candidates

In [68]:
#### Train Candidates ####
tc_Rd_seq_plot.2 = ggplot(adaptive_initial_data.2, aes(x = density, y = `%-similar-wanted`)) +
               geom_point(color = "grey") +
               geom_point(data = train_candidates_Rd.2[rep == 10], aes(colour = as.factor(iter))) +
               facet_wrap(~ seed) + 
               labs(legend = "output") +
               ggtitle(paste0("Train Candidates for each Seed","\n","with Rd Sampling for ",model.type.2))
tc_Rd_seq_plot.2

Final Data

In [69]:
#### Final Data ####
final_train_data_Rd.2 = ggplot(data = FinalTrainData_Rd.2, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~rep) +
                            #facet_warp(~rep) +
                            #facet_warp(~seed) +
                           labs(legend = "output") +
                           ggtitle(paste0("final Rd data for model_ ",model.type.2))
final_train_data_Rd.2

Model.1 vs Model.2

In [70]:
grid.arrange(Rd_seq_ind_bxp.1,Rd_seq_ind_bxp.2)
In [71]:
grid.arrange(Rd_seq_bxp.1,Rd_seq_bxp.2)

Adaptive Sampling Replications

In [72]:
#### Adaptive Sampling ####

Model.1

In [73]:
#### Model.1 ####
Ad_path.1 = paste0(outputs.path.1,"Ad_sd/")

obb_error_Ad.1              = fread(paste0(Ad_path.1,model.type.1,"_obb_error_Ad",".csv"))
performance_table_Ad.1      = fread(paste0(Ad_path.1,model.type.1,"_performance_table_Ad",".csv"))
predictedLabels_Ad.1        = fread(paste0(Ad_path.1,model.type.1,"_predictedLabels_table_Ad",".csv"))
FinalTrainData_Ad.1         = fread(paste0(Ad_path.1,model.type.1,"_FinalTrainData_Ad",".csv"))
train_candidates_Ad.1       = fread(paste0(Ad_path.1,model.type.1,"_train_candidates_table_Ad",".csv"))
In [74]:
performance_molten_Ad_seq.1 <- melt(data = performance_table_Ad.1
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_Ad_seq.1, c("variable","value"),c("errortype","errorvalue"))

predictedLabels_molten_Ad_seq.1 <- melt(data = predictedLabels_Ad.1
                             , id.vars = c("density","%-similar-wanted",'output',"seed","rep")
                             , measure.vars = c("pred_output_1","pred_output_2","pred_output_3","pred_output_4","pred_output_5","pred_output_6","pred_output_7","pred_output_8","pred_output_9","pred_output_10","pred_output_11"))
setnames(predictedLabels_molten_Ad_seq.1, c("variable","value"),c("iter","pred_output"))
predictedLabels_molten_Ad_seq.1[,iter := as.numeric(str_sub(iter, 13))]

oob error

In [75]:
#### oob error ####
oob_Ad_seq_plot.1 = ggplot(obb_error_Ad.1, aes(x=iter, y = obb_error)) + 
                geom_line( aes(color = as.factor(rep))) + 
                facet_grid(rep ~ seed) +
                ggtitle(paste0("sequential oob error with Ad Sampling for ",model.type.1)) 
oob_Ad_seq_plot.1

test error

In [76]:
#### test error ####
test_error_Ad_seq_plot.1 = ggplot(performance_molten_Ad_seq.1, aes(x=iter,y = errorvalue, group=errortype, col=errortype)) + 
                            geom_line() + 
                            facet_grid(rep ~ seed) +
                            ggtitle(paste0("sequential test error with Ad Sampling for ",model.type.1)) 
test_error_Ad_seq_plot.1

Actual vs Fitted

In [77]:
#### Actual vs Fitted ####
# The last iteration(pred_output_11) on the 10th replication
slct_rep = 10
slct_seed= 2
a_vs_f_Ad_seq.1 <- ggplot(predictedLabels_molten_Ad_seq.1[rep == slct_rep & seed == slct_seed]
                 ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap( ~ iter) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for Ad Sampling ","\n","rep:",slct_rep," and seed:",slct_seed,"\n","for ",model.type.1)) 

a_vs_f_Ad_seq.1

Replications on Each Iteration

In [78]:
#### Replications on Each Iteration ####
# Observe the boxplots in each iterations
Ad_seq_ind_bxp.1 = ggplot(performance_molten_Ad_seq.1, aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_wrap(~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Ad Sampling for model_",model.type.1))
Ad_seq_ind_bxp.1
In [79]:
Ad_seq_ind_bxp_seed.1 = ggplot(performance_molten_Ad_seq.1
                               , aes(y = errorvalue, group = errortype, col = errortype)) + 
                            geom_boxplot() + 
                            facet_grid(seed ~ iter ) + 
                            ggtitle(paste0("sequential test performances by Seeds","\n","with Ad Sampling for ",model.type.1))
Ad_seq_ind_bxp_seed.1

Overall BoxPlot

In [80]:
#### Overall BoxPlot ####
Ad_seq_bxp.1 = ggplot(data = performance_molten_Ad_seq.1[iter == 11], aes( y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                    labs( caption = paste0("final meanRMSE : ",mean(performance_table_Ad.1[iter == 11]$rmse))) +
                    ggtitle(paste0("final iteration test performace","\n"," with Ad Sampling for ",model.type.1))
                    
Ad_seq_bxp.1
In [81]:
boxplot.stats(performance_table_Ad.1[iter == 11]$rmse)
$stats
  1. 3.206474218
  2. 4.05373283
  3. 4.263984198
  4. 4.8150782695
  5. 5.55327999
$n
100
$conf
  1. 4.143691618559
  2. 4.384276777441
$out

Train Candidates

In [82]:
#### Train Candidates ####
tc_Ad_seq_plot.2 = ggplot(adaptive_initial_data.1, aes(x = density, y = `%-similar-wanted`)) +
                       geom_point(color = "grey") +
                       geom_point(data = train_candidates_Ad.1[rep == 10], aes(colour = as.factor(iter))) +
                       facet_wrap(~ seed) + 
                       labs(legend = "output") +
                       ggtitle(paste0("Train Candidates for each Seed","\n","with Ad Sampling for ",model.type.1))
tc_Ad_seq_plot.2

Final Data

In [83]:
#### Final Data ####
final_train_data_Ad.1 = ggplot(data = FinalTrainData_Ad.1, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~rep) +
                            #facet_warp(~rep) +
                            #facet_warp(~seed) +
                           labs(legend = "output") +
                           ggtitle(paste0("final Ad data for model_ ",model.type.1))
final_train_data_Ad.1

Model.2

In [84]:
#### Model.1 ####
Ad_path.2 = paste0(outputs.path.2,"Ad_sd/")

obb_error_Ad.2              = fread(paste0(Ad_path.2,model.type.2,"_obb_error_Ad",".csv"))
performance_table_Ad.2      = fread(paste0(Ad_path.2,model.type.2,"_performance_table_Ad",".csv"))
predictedLabels_Ad.2        = fread(paste0(Ad_path.2,model.type.2,"_predictedLabels_table_Ad",".csv"))
FinalTrainData_Ad.2         = fread(paste0(Ad_path.2,model.type.2,"_FinalTrainData_Ad",".csv"))
train_candidates_Ad.2       = fread(paste0(Ad_path.2,model.type.2,"_train_candidates_table_Ad",".csv"))
In [85]:
performance_molten_Ad_seq.2 <- melt(data = performance_table_Ad.2
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_Ad_seq.2, c("variable","value"),c("errortype","errorvalue"))

predictedLabels_molten_Ad_seq.2 <- melt(data = predictedLabels_Ad.2
                             , id.vars = c("density","%-similar-wanted",'output',"seed","rep")
                             , measure.vars = c("pred_output_1","pred_output_2","pred_output_3","pred_output_4","pred_output_5","pred_output_6","pred_output_7","pred_output_8","pred_output_9","pred_output_10","pred_output_11"))
setnames(predictedLabels_molten_Ad_seq.2, c("variable","value"),c("iter","pred_output"))
predictedLabels_molten_Ad_seq.2[,iter := as.numeric(str_sub(iter, 13))]

oob error

In [86]:
#### oob error ####
oob_Ad_seq_plot.2 = ggplot(obb_error_Ad.2, aes(x=iter, y = obb_error)) + 
                geom_line( aes(color = as.factor(rep))) + 
                facet_grid(rep ~ seed) +
                ggtitle(paste0("sequential oob error with Ad Sampling for ",model.type.2)) 
oob_Ad_seq_plot.2

test error

In [87]:
#### test error ####
test_error_Ad_seq_plot.2 = ggplot(performance_molten_Ad_seq.2, aes(x=iter,y = errorvalue, group=errortype, col=errortype)) + 
                            geom_line() + 
                            facet_grid(rep ~ seed) +
                            ggtitle(paste0("sequential test error with Ad Sampling for ",model.type.2)) 
test_error_Ad_seq_plot.2

Actual vs Fitted

In [88]:
#### Actual vs Fitted ####
slct_rep = 10
slct_seed= 2
a_vs_f_Ad_seq.2 <- ggplot(predictedLabels_molten_Ad_seq.2[rep == slct_rep & seed == slct_seed]
                 ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap( ~ iter) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for Ad Sampling","\n","rep:",slct_rep," and seed:",slct_seed,"\n","for ",model.type.2)) 

a_vs_f_Ad_seq.2

Replications on Each Iteration

In [89]:
#### Replications on Each Iteration ####
# Observe the boxplots in each iterations
Ad_seq_ind_bxp.2 = ggplot(performance_molten_Ad_seq.2, aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_wrap(~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Ad Sampling for model_",model.type.2))
Ad_seq_ind_bxp.2
In [90]:
Ad_seq_ind_bxp_seed.2 = ggplot(performance_molten_Ad_seq.2
                               , aes(y = errorvalue, group = errortype, col = errortype)) + 
                            geom_boxplot() + 
                            facet_grid(seed ~ iter ) + 
                            ggtitle(paste0("sequential test performances by Seeds","\n","with Ad Sampling for ",model.type.2))
Ad_seq_ind_bxp_seed.2

Overall BoxPlot

In [91]:
#### Overall BoxPlot ####
Ad_seq_bxp.2 = ggplot(data = performance_molten_Ad_seq.2[iter == 11], aes( y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                    labs( caption = paste0("final meanRMSE : ",mean(performance_table_Ad.2[iter == 11]$rmse))) +
                    ggtitle(paste0("final iteration test performace","\n"," with Ad Sampling for ",model.type.2))
                    
Ad_seq_bxp.2
In [92]:
boxplot.stats(performance_table_Ad.2[iter == 11]$rmse)
$stats
  1. 7.422622077
  2. 7.8861256165
  3. 8.20837666
  4. 8.592483558
  5. 9.235862333
$n
100
$conf
  1. 8.096772105243
  2. 8.319981214757
$out

Train Candidates

In [93]:
#### Train Candidates ####
tc_Ad_seq_plot.2 = ggplot(adaptive_initial_data.2, aes(x = density, y = `%-similar-wanted`)) +
               geom_point(color = "grey") +
               geom_point(data = train_candidates_Ad.2[rep == 10], aes(colour = as.factor(iter))) +
               facet_wrap(~ seed) + 
               labs(legend = "output") +
               ggtitle(paste0("Train Candidates for each Seed","\n","with Ad Sampling for ",model.type.2))
tc_Ad_seq_plot.2

Final Data

In [94]:
#### Final Data ####
final_train_data_Ad.2 = ggplot(data = FinalTrainData_Ad.2, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~rep) +
                            #facet_warp(~rep) +
                            #facet_warp(~seed) +
                           labs(legend = "output") +
                           ggtitle(paste0("final Ad data for model_ ",model.type.2))
final_train_data_Ad.2

Model.1 vs Model.2

In [95]:
grid.arrange(Ad_seq_ind_bxp.1,Ad_seq_ind_bxp.2)
In [96]:
grid.arrange(Ad_seq_bxp.1,Ad_seq_bxp.2)

3 Scenarios

Model.1

In [97]:
#### all BoxPlot ####
all_bxp.1 = ggplot(data = rbind(data.table(performance_molten_Ad_seq.1[iter == 11], sampling = "3Ad")
                                   ,data.table(performance_molten_Rd_seq.1[iter == 11], sampling = "2Rd")
                                   ,data.table(performance_molten_oneshot.1, sampling = "1oneshot"))
                    , aes( y = errorvalue, group=errortype, col=errortype)) +
                    facet_wrap(~sampling) +
                    geom_boxplot(aes(colour = errortype)) +
                    ggtitle(paste0("final iteration test performace","\n"," with 3 samplings for ",model.type.1))
                    
all_bxp.1

Model.2

In [98]:
#### all BoxPlot ####
all_bxp.2 = ggplot(data = rbind(data.table(performance_molten_Ad_seq.2[iter == 11], sampling = "3Ad")
                                   ,data.table(performance_molten_Rd_seq.2[iter == 11], sampling = "2Rd")
                                   ,data.table(performance_molten_oneshot.2, sampling = "1oneshot"))
                    , aes( y = errorvalue, group=errortype, col=errortype)) +
                    facet_wrap(~sampling) +
                    geom_boxplot(aes(colour = errortype)) +
                    ggtitle(paste0("final iteration test performace","\n"," with 3 Samplings for ",model.type.2))
                    
all_bxp.2

Adaptive Sampling Replications With Range

In [99]:
#### Adaptive Sampling  With Range ####

Model.1

In [100]:
#### Model.1 ####
Ad_range_path.1 = paste0(outputs.path.1,"Ad_range/")

obb_error_Ad_range.1              = fread(paste0(Ad_range_path.1,model.type.1,"_obb_error_Ad",".csv"))
performance_table_Ad_range.1      = fread(paste0(Ad_range_path.1,model.type.1,"_performance_table_Ad",".csv"))
predictedLabels_Ad_range.1        = fread(paste0(Ad_range_path.1,model.type.1,"_predictedLabels_table_Ad",".csv"))
FinalTrainData_Ad_range.1         = fread(paste0(Ad_range_path.1,model.type.1,"_FinalTrainData_Ad",".csv"))
train_candidates_Ad_range.1       = fread(paste0(Ad_range_path.1,model.type.1,"_train_candidates_table_Ad",".csv"))
In [101]:
performance_molten_Ad_range_seq.1 <- melt(data = performance_table_Ad_range.1
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_Ad_range_seq.1, c("variable","value"),c("errortype","errorvalue"))

predictedLabels_molten_Ad_range_seq.1 <- melt(data = predictedLabels_Ad_range.1
                             , id.vars = c("density","%-similar-wanted",'output',"seed","rep")
                             , measure.vars = c("pred_output_1","pred_output_2","pred_output_3","pred_output_4","pred_output_5","pred_output_6","pred_output_7","pred_output_8","pred_output_9","pred_output_10","pred_output_11"))
setnames(predictedLabels_molten_Ad_range_seq.1, c("variable","value"),c("iter","pred_output"))
predictedLabels_molten_Ad_range_seq.1[,iter := as.numeric(str_sub(iter, 13))]

oob error

In [102]:
#### oob error ####
oob_Ad_range_seq_plot.1 = ggplot(obb_error_Ad_range.1, aes(x=iter, y = obb_error)) + 
                geom_line( aes(color = as.factor(rep))) + 
                facet_grid(rep ~ seed) +
                ggtitle(paste0("sequential oob error with Ad_range Sampling for ",model.type.1)) 
oob_Ad_range_seq_plot.1

test error

In [103]:
#### test error ####
test_error_Ad_range_seq_plot.1 = ggplot(performance_molten_Ad_range_seq.1, aes(x=iter,y = errorvalue, group=errortype, col=errortype)) + 
                            geom_line() + 
                            facet_grid(rep ~ seed) +
                            ggtitle(paste0("sequential test error with Ad_range Sampling for ",model.type.1)) 
test_error_Ad_range_seq_plot.1

Actual vs Fitted

In [104]:
#### Actual vs Fitted ####
# The last iteration(pred_output_11) on the 10th replication
slct_rep = 10
slct_seed= 2
a_vs_f_Ad_range_seq.1 <- ggplot(predictedLabels_molten_Ad_range_seq.1[rep == slct_rep & seed == slct_seed]
                 ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap( ~ iter) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for Ad_range Sampling ","\n","rep:",slct_rep," and seed:",slct_seed,"\n","for ",model.type.1)) 

a_vs_f_Ad_range_seq.1

Replications on Each Iteration

In [105]:
#### Replications on Each Iteration ####
# Observe the boxplots in each iterations
Ad_range_seq_ind_bxp.1 = ggplot(performance_molten_Ad_range_seq.1, aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_wrap(~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Ad_range Sampling for model_",model.type.1))
Ad_range_seq_ind_bxp.1
In [106]:
Ad_range_seq_ind_bxp_seed.1 = ggplot(performance_molten_Ad_range_seq.1
                               , aes(y = errorvalue, group = errortype, col = errortype)) + 
                            geom_boxplot() + 
                            facet_grid(seed ~ iter ) + 
                            ggtitle(paste0("sequential test performances by Seeds","\n","with Ad_range Sampling for ",model.type.1))
Ad_range_seq_ind_bxp_seed.1

Overall BoxPlot

In [107]:
#### Overall BoxPlot ####
Ad_range_seq_bxp.1 = ggplot(data = performance_molten_Ad_range_seq.1[iter == 11], aes( y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                    labs( caption = paste0("final meanRMSE : ",mean(performance_table_Ad_range.1[iter == 11]$rmse))) +
                    ggtitle(paste0("final iteration test performace","\n"," with Ad_range Sampling for ",model.type.1))
                    
Ad_range_seq_bxp.1
In [108]:
boxplot.stats(performance_table_Ad_range.1[iter == 11]$rmse)
$stats
  1. 3.340618452
  2. 4.108782821
  3. 4.3908823925
  4. 4.8619712115
  5. 5.942414519
$n
100
$conf
  1. 4.271878626801
  2. 4.509886158199
$out
  1. 6.299480616
  2. 6.013382504

Train Candidates

In [109]:
#### Train Candidates ####
tc_Ad_range_seq_plot.2 = ggplot(adaptive_initial_data.1, aes(x = density, y = `%-similar-wanted`)) +
                       geom_point(color = "grey") +
                       geom_point(data = train_candidates_Ad_range.1[rep == 10], aes(colour = as.factor(iter))) +
                       facet_wrap(~ seed) + 
                       labs(legend = "output") +
                       ggtitle(paste0("Train Candidates for each Seed","\n","with Ad_range Sampling for ",model.type.1))
tc_Ad_range_seq_plot.2

Final Data

In [110]:
#### Final Data ####
final_train_data_Ad_range.1 = ggplot(data = FinalTrainData_Ad_range.1, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~rep) +
                            #facet_warp(~rep) +
                            #facet_warp(~seed) +
                           labs(legend = "output") +
                           ggtitle(paste0("final Ad_range data for model_ ",model.type.1))
final_train_data_Ad_range.1

Model.2

In [111]:
#### Model.2 ####
Ad_range_path.2 = paste0(outputs.path.2,"Ad_range/")

obb_error_Ad_range.2              = fread(paste0(Ad_range_path.2,model.type.2,"_obb_error_Ad",".csv"))
performance_table_Ad_range.2      = fread(paste0(Ad_range_path.2,model.type.2,"_performance_table_Ad",".csv"))
predictedLabels_Ad_range.2        = fread(paste0(Ad_range_path.2,model.type.2,"_predictedLabels_table_Ad",".csv"))
FinalTrainData_Ad_range.2         = fread(paste0(Ad_range_path.2,model.type.2,"_FinalTrainData_Ad",".csv"))
train_candidates_Ad_range.2       = fread(paste0(Ad_range_path.2,model.type.2,"_train_candidates_table_Ad",".csv"))
In [112]:
performance_molten_Ad_range_seq.2 <- melt(data = performance_table_Ad_range.2
                             , id.vars = c('iter',"seed","rep"))
setnames(performance_molten_Ad_range_seq.2, c("variable","value"),c("errortype","errorvalue"))

predictedLabels_molten_Ad_range_seq.2 <- melt(data = predictedLabels_Ad_range.2
                             , id.vars = c("density","%-similar-wanted",'output',"seed","rep")
                             , measure.vars = c("pred_output_1","pred_output_2","pred_output_3","pred_output_4","pred_output_5","pred_output_6","pred_output_7","pred_output_8","pred_output_9","pred_output_10","pred_output_11"))
setnames(predictedLabels_molten_Ad_range_seq.2, c("variable","value"),c("iter","pred_output"))
predictedLabels_molten_Ad_range_seq.2[,iter := as.numeric(str_sub(iter, 13))]

oob error

In [113]:
#### oob error ####
oob_Ad_range_seq_plot.2 = ggplot(obb_error_Ad_range.2, aes(x=iter, y = obb_error)) + 
                geom_line( aes(color = as.factor(rep))) + 
                facet_grid(rep ~ seed) +
                ggtitle(paste0("sequential oob error with Ad_range Sampling for ",model.type.2)) 
oob_Ad_range_seq_plot.2

test error

In [114]:
#### test error ####
test_error_Ad_range_seq_plot.2 = ggplot(performance_molten_Ad_range_seq.2, aes(x=iter,y = errorvalue, group=errortype, col=errortype)) + 
                            geom_line() + 
                            facet_grid(rep ~ seed) +
                            ggtitle(paste0("sequential test error with Ad_range Sampling for ",model.type.2)) 
test_error_Ad_range_seq_plot.2

Actual vs Fitted

In [115]:
#### Actual vs Fitted ####
slct_rep = 10
slct_seed= 2
a_vs_f_Ad_range_seq.2 <- ggplot(predictedLabels_molten_Ad_range_seq.2[rep == slct_rep & seed == slct_seed]
                 ,aes(x = output, y =pred_output, color = pred_output - output)) +
            geom_point() +
            geom_abline() +
            facet_wrap( ~ iter) +
            xlab("actual values") +
            ylab("fitted values") +
            ggtitle(paste0("Actual vs Fitted for Ad_range Sampling","\n","rep:",slct_rep," and seed:",slct_seed,"\n","for ",model.type.2)) 

a_vs_f_Ad_range_seq.2

Replications on Each Iteration

In [116]:
#### Replications on Each Iteration ####
# Observe the boxplots in each iterations
Ad_range_seq_ind_bxp.2 = ggplot(performance_molten_Ad_range_seq.2, aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_wrap(~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Ad_range Sampling for model_",model.type.2))
Ad_range_seq_ind_bxp.2
In [117]:
Ad_range_seq_ind_bxp_seed.2 = ggplot(performance_molten_Ad_range_seq.2
                               , aes(y = errorvalue, group = errortype, col = errortype)) + 
                            geom_boxplot() + 
                            facet_grid(seed ~ iter ) + 
                            ggtitle(paste0("sequential test performances by Seeds","\n","with Ad_range Sampling for ",model.type.2))
Ad_range_seq_ind_bxp_seed.2

Overall BoxPlot

In [118]:
#### Overall BoxPlot ####
Ad_range_seq_bxp.2 = ggplot(data = performance_molten_Ad_range_seq.2[iter == 11], aes( y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                    labs( caption = paste0("final meanRMSE : ",mean(performance_table_Ad_range.2[iter == 11]$rmse))) +
                    ggtitle(paste0("final iteration test performace","\n"," with Ad Sampling for ",model.type.2))
                    
Ad_range_seq_bxp.2
In [119]:
boxplot.stats(performance_table_Ad_range.2[iter == 11]$rmse)
$stats
  1. 7.635228706
  2. 8.467118838
  3. 8.69378575
  4. 9.447025294
  5. 10.51337816
$n
100
$conf
  1. 8.538960529952
  2. 8.848610970048
$out
11.17798416

Train Candidates

In [120]:
#### Train Candidates ####
tc_Ad_range_seq_plot.2 = ggplot(adaptive_initial_data.2, aes(x = density, y = `%-similar-wanted`)) +
               geom_point(color = "grey") +
               geom_point(data = train_candidates_Ad_range.2[rep == 10], aes(colour = as.factor(iter))) +
               facet_wrap(~ seed) + 
               labs(legend = "output") +
               ggtitle(paste0("Train Candidates for each Seed","\n","with Ad_range Sampling for ",model.type.2))
tc_Ad_range_seq_plot.2

Final Data

In [121]:
#### Final Data ####
final_train_data_Ad_range.2 = ggplot(data = FinalTrainData_Ad_range.2, aes(x = density, y = `%-similar-wanted`)) +
                           geom_point(aes(colour = output)) +
                           facet_grid(seed~rep) +
                            #facet_warp(~rep) +
                            #facet_warp(~seed) +
                           labs(legend = "output") +
                           ggtitle(paste0("final Ad_range data for model_ ",model.type.2))
final_train_data_Ad_range.2

Model.1 vs Model.2

In [122]:
grid.arrange(Ad_range_seq_ind_bxp.1,Ad_range_seq_ind_bxp.2)
In [123]:
grid.arrange(Ad_range_seq_bxp.1,Ad_range_seq_bxp.2)

Ad vs Ad_range

Model.1

In [124]:
AdS_seq_ind_bxp.1 = ggplot(rbind(data.table(performance_molten_Ad_seq.1, sampling = "1Ad")
                                ,data.table(performance_molten_Ad_range_seq.1, sampling = "2Ad_range"))
                                , aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Ad Samplings for model_",model.type.1))
AdS_seq_ind_bxp.1
In [125]:
grid.arrange(Ad_seq_bxp.1,Ad_range_seq_bxp.1, ncol = 2)

Model.2

In [126]:
AdS_seq_ind_bxp.2 = ggplot(rbind(data.table(performance_molten_Ad_seq.2, sampling = "1Ad")
                                ,data.table(performance_molten_Ad_range_seq.2, sampling = "2Ad_range"))
                                , aes(y = errorvalue, group=errortype, col=errortype)) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                #  geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
                  ggtitle(paste0("sequential test performances","\n","with Ad Samplings for model_",model.type.2))
AdS_seq_ind_bxp.2
In [127]:
grid.arrange(Ad_seq_bxp.2,Ad_range_seq_bxp.2, ncol = 2)

4 Scenarios_Part 1

Model.1

In [128]:
rmse_graph_molten_data.1 = rbind(data.table(performance_molten_Ad_seq.1[errortype == "rmse"], sampling = "2Ad", oneshot = 0)
                              ,data.table(performance_molten_oneshot.1[errortype == "rmse",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "2Ad",oneshot=1)
                              ,data.table(performance_molten_Ad_range_seq.1[errortype == "rmse"], sampling = "3Ad_range",oneshot=0)
                              ,data.table(performance_molten_oneshot.1[errortype == "rmse",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "3Ad_range",oneshot=1)
                              ,data.table(performance_molten_Rd_seq.1[errortype == "rmse"], sampling = "1Rd",oneshot=0)
                              ,data.table(performance_molten_oneshot.1[errortype == "rmse",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "1Rd",oneshot=1)
                             )

ggplot(rmse_graph_molten_data.1, aes(y = errorvalue, col=as.factor(oneshot))) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                ggtitle(paste0("RMSE plots of Sequential Sampling vs oneshot(iter= 12)","\n","for dummy"))
In [129]:
mae_graph_molten_data.1 = rbind(data.table(performance_molten_Ad_seq.1[errortype == "mae"], sampling = "2Ad", oneshot = 0)
                             ,data.table(performance_molten_oneshot.1[errortype == "mae",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "2Ad",oneshot=1)
                             ,data.table(performance_molten_Ad_range_seq.1[errortype == "mae"], sampling = "3Ad_range",oneshot=0)
                             ,data.table(performance_molten_oneshot.1[errortype == "mae",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "3Ad_range",oneshot=1)
                             ,data.table(performance_molten_Rd_seq.1[errortype == "mae"], sampling = "1Rd",oneshot=0)
                             ,data.table(performance_molten_oneshot.1[errortype == "mae",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "1Rd",oneshot=1)
                             )

ggplot(mae_graph_molten_data.1, aes(y = errorvalue, col=as.factor(oneshot))) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                ggtitle(paste0("MAE plots of Sequential Sampling vs oneshot(iter= 12)","\n","for dummy"))
In [130]:
mape_graph_molten_data.1 = rbind(data.table(performance_molten_Ad_seq.1[errortype == "mape"], sampling = "2Ad", oneshot = 0)
                             ,data.table(performance_molten_oneshot.1[errortype == "mape",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "2Ad",oneshot=1)
                             ,data.table(performance_molten_Ad_range_seq.1[errortype == "mape"], sampling = "3Ad_range",oneshot=0)
                             ,data.table(performance_molten_oneshot.1[errortype == "mape",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "3Ad_range",oneshot=1)
                             ,data.table(performance_molten_Rd_seq.1[errortype == "mape"], sampling = "1Rd",oneshot=0)
                             ,data.table(performance_molten_oneshot.1[errortype == "mape",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "1Rd",oneshot=1)
                             )

ggplot(mape_graph_molten_data.1, aes(y = errorvalue, col=as.factor(oneshot))) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                ggtitle(paste0("MAPE plots of Sequential Sampling vs oneshot(iter= 12)","\n","for dummy"))
In [131]:
all_range_bxp.1 = ggplot(data = rbind(data.table(performance_molten_Ad_range_seq.1[iter == 11], sampling = "4Ad_range")
                                     ,data.table(performance_molten_Ad_seq.1[iter == 11], sampling = "3Ad")
                                     ,data.table(performance_molten_Rd_seq.1[iter == 11], sampling = "1Rd")
                                     ,data.table(performance_molten_oneshot.1, sampling = "2oneshot"))
                    , aes( y = errorvalue, group=errortype, col=errortype)) +
                    facet_wrap(~sampling, ncol = 4) +
                    geom_boxplot(aes(colour = errortype)) +
                    ggtitle(paste0("final iteration test performace","\n"," with 4 samplings for ",model.type.1))
                    
all_range_bxp.1

Model.2

In [134]:
rmse_graph_molten_data.2 = rbind(data.table(performance_molten_Ad_seq.2[errortype == "rmse"], sampling = "2Ad", oneshot = 0)
                              ,data.table(performance_molten_oneshot.2[errortype == "rmse",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "2Ad",oneshot=1)
                              ,data.table(performance_molten_Ad_range_seq.2[errortype == "rmse"], sampling = "3Ad_range",oneshot=0)
                              ,data.table(performance_molten_oneshot.2[errortype == "rmse",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "3Ad_range",oneshot=1)
                              ,data.table(performance_molten_Rd_seq.2[errortype == "rmse"], sampling = "1Rd",oneshot=0)
                              ,data.table(performance_molten_oneshot.2[errortype == "rmse",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "1Rd",oneshot=1)
                             )

ggplot(rmse_graph_molten_data.2, aes(y = errorvalue, col=as.factor(oneshot))) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                ggtitle(paste0("RMSE plots of Sequential Sampling vs oneshot(iter= 12)","\n","for dummy"))
In [135]:
mae_graph_molten_data.2 = rbind(data.table(performance_molten_Ad_seq.2[errortype == "mae"], sampling = "2Ad", oneshot = 0)
                             ,data.table(performance_molten_oneshot.2[errortype == "mae",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "2Ad",oneshot=1)
                             ,data.table(performance_molten_Ad_range_seq.2[errortype == "mae"], sampling = "3Ad_range",oneshot=0)
                             ,data.table(performance_molten_oneshot.2[errortype == "mae",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "3Ad_range",oneshot=1)
                             ,data.table(performance_molten_Rd_seq.2[errortype == "mae"], sampling = "1Rd",oneshot=0)
                             ,data.table(performance_molten_oneshot.2[errortype == "mae",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "1Rd",oneshot=1)
                             )

ggplot(mae_graph_molten_data.2, aes(y = errorvalue, col=as.factor(oneshot))) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                ggtitle(paste0("MAE plots of Sequential Sampling vs oneshot(iter= 12)","\n","for dummy"))
In [136]:
mape_graph_molten_data.2 = rbind(data.table(performance_molten_Ad_seq.2[errortype == "mape"], sampling = "2Ad", oneshot = 0)
                             ,data.table(performance_molten_oneshot.2[errortype == "mape",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "2Ad",oneshot=1)
                             ,data.table(performance_molten_Ad_range_seq.2[errortype == "mape"], sampling = "3Ad_range",oneshot=0)
                             ,data.table(performance_molten_oneshot.2[errortype == "mape",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "3Ad_range",oneshot=1)
                             ,data.table(performance_molten_Rd_seq.2[errortype == "mape"], sampling = "1Rd",oneshot=0)
                             ,data.table(performance_molten_oneshot.2[errortype == "mape",.(iter = 12,seed,rep,errortype,errorvalue)], sampling = "1Rd",oneshot=1)
                             )

ggplot(mape_graph_molten_data.2, aes(y = errorvalue, col=as.factor(oneshot))) + 
                  geom_boxplot()+
                  facet_grid(sampling~iter) +
                ggtitle(paste0("MAPE plots of Sequential Sampling vs oneshot(iter= 12)","\n","for dummy"))
In [137]:
all_range_bxp.2 = ggplot(data = rbind(data.table(performance_molten_Ad_range_seq.2[iter == 11], sampling = "4Ad_range")
                                     ,data.table(performance_molten_Ad_seq.2[iter == 11], sampling = "3Ad")
                                     ,data.table(performance_molten_Rd_seq.2[iter == 11], sampling = "1Rd")
                                     ,data.table(performance_molten_oneshot.2, sampling = "2oneshot"))
                    , aes( y = errorvalue, group=errortype, col=errortype)) +
                    facet_wrap(~sampling, ncol = 4) +
                    geom_boxplot(aes(colour = errortype)) +
                    ggtitle(paste0("final iteration test performace","\n"," with 4 samplings for ",model.type.2))
                    
all_range_bxp.2

Metamodel Replications On Final Data Rd

In [138]:
####  Metamodel Replications On Final Data Rd #### 

if (model.type.1 == "basic") {
    feature_names.1 = c("density", "%-similar-wanted")
    feature_names.2 = c("density", "%-similar-wanted", "budget-multiplier-dummy", "density-multiplier-dummy", 
        "noise-dummy", "tick-limit")
} else if (model.type.1 == "dummy") {
    feature_names.1 = c("density", "%-similar-wanted", "budget-multiplier-dummy", "density-multiplier-dummy", 
        "noise-dummy", "tick-limit")
    feature_names.2 = c("density", "%-similar-wanted")
}
In [139]:
feature_names.1
feature_names.2
  1. 'density'
  2. '%-similar-wanted'
  1. 'density'
  2. '%-similar-wanted'
  3. 'budget-multiplier-dummy'
  4. 'density-multiplier-dummy'
  5. 'noise-dummy'
  6. 'tick-limit'

Model.1

In [140]:
#### Model.1 ####
mtry = 2
ntree = 300
predictedLabels_Rd_rep.1 = data.table()
performance_table_Rd_rep.1 = data.table(seed = numeric(), rep = numeric(), new_rep = numeric(),mae = numeric(), 
    rmse = numeric(), mape = numeric())

for (s in seed.Ad.1) {
    for (k in 1:10) {
        trainx.1 = FinalTrainData_Rd.1[seed == s & rep == k, .SD, .SDcols = feature_names.1]
        trainy.1 = FinalTrainData_Rd.1[seed == s & rep == k]$output
        
        test_set.1.retrain = copy(test_set.1)
        
        
        for (r in 1:10) {
            model_Sub.1 <- randomForest(x = trainx.1, y = trainy.1, importance = TRUE, 
                ntree = ntree, mtry = mtry)
            
            predictedLabels.1 <- predict(model_Sub.1, test_set.1.retrain)
            predictedLabels.1 <- cbind(test_set.1.retrain, data.table(pred_output = predictedLabels.1, 
                seed = s, rep = k, new_rep = r))
            
            predictedLabels_Rd_rep.1 <- rbind(predictedLabels_Rd_rep.1, predictedLabels.1)
            
            output_variables = colnames(select(predictedLabels.1, contains("output")))
            output_variables_1 = predictedLabels.1[, get(output_variables[1]), with = TRUE]
            output_variables_2 = predictedLabels.1[, get(output_variables[2]), with = TRUE]
            
            performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
            performance_temp[1] = mae(output_variables_1, output_variables_2)
            performance_temp[2] = rmse(output_variables_1, output_variables_2)
            performance_temp[3] = mape(output_variables_1, output_variables_2)
            
            performance_table_Rd_rep.1 = rbind(performance_table_Rd_rep.1, data.table(s, 
                r,k, performance_temp), use.names = FALSE)
            
        }
        
        rm(trainx.1, trainy.1)
    }
}
In [141]:
performance_molten_Rd.1 <- melt(data = performance_table_Rd_rep.1
                             , id.vars = c("seed","rep","new_rep"))
setnames(performance_molten_Rd.1, c("variable","value"),c("errortype","errorvalue"))

Replications on Each Seed

In [142]:
#### Replications on Each Seed ####

Rd_ind_bxp.1 = ggplot(performance_molten_Rd.1, aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_wrap(~seed,ncol=3) +
                        ggtitle(paste0(" test performances","\n","with Rd_WRep Sampling for model_",model.type.1)) 
Rd_ind_bxp.1
In [143]:
bxp.Rd.1 <-  boxplot( data = performance_molten_Rd.1[errortype == "rmse"], errorvalue  ~ seed, col = "pink")
bxp.Rd.1
$stats
A matrix: 5 × 10 of type dbl
3.9961053.7822404.9449714.0387425.4259323.0043704.7264634.0307004.3348143.817322
4.2915734.0678436.2559374.3790276.2775303.7584984.9268704.4316534.6461664.283341
4.6309984.1835817.1134885.3206957.4667164.0292065.2431704.5918594.9597924.875240
4.9167565.2854828.2826416.9886227.9898154.3762275.8516084.8366666.8437405.449885
5.6096665.8583009.3359248.2054798.5492534.9148296.7551935.1695027.2957425.986098
$n
  1. 100
  2. 100
  3. 100
  4. 100
  5. 100
  6. 100
  7. 100
  8. 100
  9. 100
  10. 100
$conf
A matrix: 2 × 10 of type dbl
4.5322193.9911946.7932694.9083797.1961753.9316055.0970614.5278674.6125754.690927
4.7297774.3759687.4337075.7330117.7372584.1268075.3892784.6558515.3070095.059554
$out
  1. 6.29545093231803
  2. 6.33252941515597
  3. 6.08888492411505
  4. 6.18693591378688
  5. 6.23716296466649
  6. 6.22300525036332
  7. 6.35455067839955
  8. 6.3019609855209
  9. 6.1482006798952
  10. 6.24595031119335
$group
  1. 1
  2. 1
  3. 1
  4. 1
  5. 1
  6. 1
  7. 1
  8. 1
  9. 1
  10. 1
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [144]:
performance_table_Rd_rep.1[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<dbl><dbl>
54.022087
14.507509
74.608460
04.775076
204.849466
85.465603
65.486520
35.669241
47.209311
27.252919

Overall BoxPlot

In [145]:
Rd_bxp.1 = ggplot(data = performance_molten_Rd.1, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_Rd_rep.1$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Rd_WRep Sampling for ",model.type.1))
                    
Rd_bxp.1
In [146]:
boxplot.stats(performance_table_Rd_rep.1$rmse)
$stats
  1. 3.00436980960105
  2. 4.36734749228184
  3. 4.91289418616681
  4. 6.23208385498389
  5. 8.99603569771966
$n
1000
$conf
  1. 4.81972452272517
  2. 5.00606384960846
$out
  1. 9.32233364483056
  2. 9.03027311504666
  3. 9.17761223501115
  4. 9.2079752280594
  5. 9.05702500696862
  6. 9.1472114815871
  7. 9.33592427699237
  8. 9.15981186563601
  9. 9.26083773032435

Model.2

In [147]:
#### Model.2 ####
mtry = 2
ntree = 300
predictedLabels_Rd_rep.2 = data.table()
performance_table_Rd_rep.2 = data.table(seed = numeric(), rep = numeric(), new_rep = numeric(), mae = numeric(), 
    rmse = numeric(), mape = numeric())

for (s in seed.Ad.2) {
    for (k in 1:10) {
        trainx.2 = FinalTrainData_Rd.2[seed == s & rep == k, .SD, .SDcols = feature_names.2]
        trainy.2 = FinalTrainData_Rd.2[seed == s & rep == k]$output
        
        test_set.2.retrain = copy(test_set.2)
        
        
        for (r in 1:10) {
            model_Sub.2 <- randomForest(x = trainx.2, y = trainy.2, importance = TRUE, 
                ntree = ntree, mtry = mtry)
            
            predictedLabels.2 <- predict(model_Sub.2, test_set.2.retrain)
            predictedLabels.2 <- cbind(test_set.2.retrain, data.table(pred_output = predictedLabels.2, 
                seed = s, rep = r, new_rep = r))
            
            predictedLabels_Rd_rep.2 <- rbind(predictedLabels_Rd_rep.2, predictedLabels.2)
            
            output_variables = colnames(select(predictedLabels.2, contains("output")))
            output_variables_1 = predictedLabels.2[, get(output_variables[1]), with = TRUE]
            output_variables_2 = predictedLabels.2[, get(output_variables[2]), with = TRUE]
            
            performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
            performance_temp[1] = mae(output_variables_1, output_variables_2)
            performance_temp[2] = rmse(output_variables_1, output_variables_2)
            performance_temp[3] = mape(output_variables_1, output_variables_2)
            
            performance_table_Rd_rep.2 = rbind(performance_table_Rd_rep.2, data.table(s, 
                r,k, performance_temp), use.names = FALSE)
            
        }
        
        rm(trainx.2, trainy.2)
    }
}
In [148]:
performance_molten_Rd.2 <- melt(data = performance_table_Rd_rep.2
                             , id.vars = c("seed","rep","new_rep"))
setnames(performance_molten_Rd.2, c("variable","value"),c("errortype","errorvalue"))

Replications on Each Seed

In [149]:
#### Replications on Each Seed ####

Rd_ind_bxp.2 = ggplot(performance_molten_Rd.2, aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_wrap(~seed,ncol=3) +
                        ggtitle(paste0(" test performances","\n","with Rd_WRep Sampling for model_",model.type.2)) 
Rd_ind_bxp.2
In [150]:
bxp.Rd.2 <-  boxplot( data = performance_molten_Rd.2[errortype == "rmse"], errorvalue  ~ seed, col = "pink")
bxp.Rd.2
$stats
A matrix: 5 × 10 of type dbl
8.501527 9.191198 9.166059 8.419463 8.344319 8.183371 8.786417 9.7463048.713798 8.680392
8.987740 9.675526 9.660424 9.227267 8.862342 8.776854 9.45573510.1600079.125836 9.248141
9.124447 9.943488 9.924739 9.529264 9.300452 9.123358 9.78466510.4645959.322629 9.413195
9.33036610.31223410.135199 9.766797 9.522447 9.320112 9.95111410.9409509.501293 9.677637
9.82216110.79715110.78657310.18341310.16988510.11210510.64206211.9972039.94884110.313212
$n
  1. 100
  2. 100
  3. 100
  4. 100
  5. 100
  6. 100
  7. 100
  8. 100
  9. 100
  10. 100
$conf
A matrix: 2 × 10 of type dbl
9.070312 9.8428889.8497249.4440189.1961569.0375249.70639510.341219.2633079.345335
9.17858210.0440879.9997539.6145109.4047499.2091939.86293410.587989.3819519.481056
$out
  1. 8.40907731312565
  2. 9.93290516570438
  3. 8.4626745422317
  4. 8.39834359624836
  5. 8.41804543328901
  6. 8.46180317883929
  7. 8.15825919299355
  8. 8.44166099099455
  9. 8.9448589808662
  10. 8.26916261683614
  11. 7.95988111708694
  12. 8.27727368065007
  13. 8.34975378817249
  14. 8.17245843468791
  15. 8.31181072163272
  16. 7.97520451905435
  17. 12.1125053092027
  18. 12.2881426073606
  19. 12.4630755429888
  20. 12.1700966762024
  21. 12.3443397015758
  22. 12.6383733345971
  23. 12.2292345480596
  24. 10.240841172984
$group
  1. 1
  2. 1
  3. 1
  4. 1
  5. 1
  6. 1
  7. 1
  8. 1
  9. 3
  10. 4
  11. 4
  12. 4
  13. 4
  14. 4
  15. 4
  16. 4
  17. 8
  18. 8
  19. 8
  20. 8
  21. 8
  22. 8
  23. 8
  24. 9
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [151]:
performance_table_Rd_rep.2[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<dbl><dbl>
5 9.067760
0 9.122736
4 9.212541
8 9.317767
3 9.451310
20 9.467711
6 9.720779
2 9.908731
1 9.996092
710.625330

Overall BoxPlot

In [152]:
Rd_bxp.2 = ggplot(data = performance_molten_Rd.2, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_Rd_rep.2$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Rd_WRep Sampling for ",model.type.2))
                    
Rd_bxp.2
In [153]:
boxplot.stats(performance_table_Rd_rep.2$rmse)
$stats
  1. 8.15825919299355
  2. 9.19485308277183
  3. 9.52587221678016
  4. 9.92606297201545
  5. 11.0223645685022
$n
1000
$conf
  1. 9.4893380553573
  2. 9.56240637820302
$out
  1. 7.95988111708694
  2. 7.97520451905435
  3. 11.0634742324356
  4. 11.319777197386
  5. 11.1119115643922
  6. 11.2751402715054
  7. 11.0242756027285
  8. 11.1339801451817
  9. 11.350095167144
  10. 12.1125053092027
  11. 12.2881426073606
  12. 12.4630755429888
  13. 12.1700966762024
  14. 11.9972025842525
  15. 11.8034306400299
  16. 12.3443397015758
  17. 12.6383733345971
  18. 11.9707008171475
  19. 12.2292345480596
  20. 11.1117584870661
  21. 11.2532313585056
  22. 11.1407716812469
  23. 11.0559869544991

Metamodel Replications On Final Data Ad

Model.1

In [154]:
#### Model.1 ####
mtry = 2
ntree = 300
predictedLabels_Ad_rep.1 = data.table()
performance_table_Ad_rep.1 = data.table(seed = numeric(), rep = numeric(), new_rep = numeric(), mae = numeric(), 
    rmse = numeric(), mape = numeric())

for (s in seed.Ad.1) {
    for (k in 1:10) {
        trainx.1 = FinalTrainData_Ad.1[seed == s & rep == k, .SD, .SDcols = feature_names.1]
        trainy.1 = FinalTrainData_Ad.1[seed == s & rep == k]$output
        
        test_set.1.retrain = copy(test_set.1)
        
        
        for (r in 1:10) {
            model_Sub.1 <- randomForest(x = trainx.1, y = trainy.1, importance = TRUE, 
                ntree = ntree, mtry = mtry)
            
            predictedLabels.1 <- predict(model_Sub.1, test_set.1.retrain)
            predictedLabels.1 <- cbind(test_set.1.retrain, data.table(pred_output = predictedLabels.1, 
                seed = s, rep = r, new_rep = k))
            
            predictedLabels_Ad_rep.1 <- rbind(predictedLabels_Ad_rep.1, predictedLabels.1)
            
            output_variables = colnames(select(predictedLabels.1, contains("output")))
            output_variables_1 = predictedLabels.1[, get(output_variables[1]), with = TRUE]
            output_variables_2 = predictedLabels.1[, get(output_variables[2]), with = TRUE]
            
            performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
            performance_temp[1] = mae(output_variables_1, output_variables_2)
            performance_temp[2] = rmse(output_variables_1, output_variables_2)
            performance_temp[3] = mape(output_variables_1, output_variables_2)
            
            performance_table_Ad_rep.1 = rbind(performance_table_Ad_rep.1, data.table(s, 
                r,k, performance_temp), use.names = FALSE)
            
        }
        
        rm(trainx.1, trainy.1)
    }
}
In [162]:
performance_molten_Ad.1 <- melt(data = performance_table_Ad_rep.1
                             , id.vars = c("seed","rep","new_rep"))
setnames(performance_molten_Ad.1, c("variable","value"),c("errortype","errorvalue"))

Replications on Each Seed

In [163]:
#### Replications on Each Seed ####

Ad_ind_bxp.1 = ggplot(performance_molten_Ad.1, aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_wrap(~seed,ncol=3) +
                        ggtitle(paste0(" test performances","\n","with Ad_WRep Sampling for model_",model.type.1)) 
Ad_ind_bxp.1
In [164]:
bxp.Ad.1 <-  boxplot( data = performance_molten_Ad.1[errortype == "rmse"], errorvalue  ~ seed, col = "pink")
bxp.Ad.1
$stats
A matrix: 5 × 10 of type dbl
3.8653413.9298484.3647053.9135165.0886173.1329014.4193353.7171534.4606303.773842
4.1169404.0774574.7127314.1970725.3043393.3347154.6875383.9218834.7043014.007472
4.2152664.1350124.8639614.3289705.3945173.3986864.8099544.0123164.9002784.091639
4.2914024.1902074.9897624.4607665.4888453.5034244.9032384.1026695.0636264.192640
4.4769144.3283035.2224804.8213875.7431713.6607515.1392574.3303885.4090344.439938
$n
  1. 100
  2. 100
  3. 100
  4. 100
  5. 100
  6. 100
  7. 100
  8. 100
  9. 100
  10. 100
$conf
A matrix: 2 × 10 of type dbl
4.1877014.1171984.8201904.2873065.3653653.3720304.7758733.9837524.8435054.062383
4.2428314.1528274.9077324.3706335.4236693.4253424.8440344.0408814.9570524.120896
$out
  1. 4.42835692712547
  2. 4.40246074932302
  3. 3.81186670798459
  4. 4.406465584927
  5. 5.82655356871349
  6. 5.76833506546536
$group
  1. 2
  2. 2
  3. 2
  4. 2
  5. 5
  6. 5
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [165]:
performance_table_Ad_rep.1[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<dbl><dbl>
53.411027
74.015857
204.096519
14.132631
04.216448
34.328317
64.795610
24.845807
84.885793
45.410051

Overall BoxPlot

In [166]:
Ad_bxp.1 = ggplot(data = performance_molten_Ad.1, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_Ad_rep.1$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Ad_WRep Sampling for ",model.type.1))
                    
Ad_bxp.1
In [167]:
boxplot.stats(performance_table_Ad_rep.1$rmse)
$stats
  1. 3.13290133884073
  2. 4.06569838627166
  3. 4.2833141901022
  4. 4.84707251099534
  5. 5.82655356871349
$n
1000
$conf
  1. 4.24427362346842
  2. 4.32235475673599
$out

Model.2

In [168]:
#### Model.2 ####
mtry = 2
ntree = 300
predictedLabels_Ad_rep.2 = data.table()
performance_table_Ad_rep.2 = data.table(seed = numeric(), rep = numeric(), new_rep = numeric(), mae = numeric(), 
    rmse = numeric(), mape = numeric())

for (s in seed.Ad.2) {
    for (k in 1:10) {
        trainx.2 = FinalTrainData_Ad.2[seed == s & rep == k, .SD, .SDcols = feature_names.2]
        trainy.2 = FinalTrainData_Ad.2[seed == s & rep == k]$output
        
        test_set.2.retrain = copy(test_set.2)
        
        
        for (r in 1:10) {
            model_Sub.2 <- randomForest(x = trainx.2, y = trainy.2, importance = TRUE, 
                ntree = ntree, mtry = mtry)
            
            predictedLabels.2 <- predict(model_Sub.2, test_set.2.retrain)
            predictedLabels.2 <- cbind(test_set.2.retrain, data.table(pred_output = predictedLabels.2, 
                seed = s, rep = r, new_rep = k))
            
            predictedLabels_Ad_rep.2 <- rbind(predictedLabels_Ad_rep.2, predictedLabels.2)
            
            output_variables = colnames(select(predictedLabels.2, contains("output")))
            output_variables_1 = predictedLabels.2[, get(output_variables[1]), with = TRUE]
            output_variables_2 = predictedLabels.2[, get(output_variables[2]), with = TRUE]
            
            performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
            performance_temp[1] = mae(output_variables_1, output_variables_2)
            performance_temp[2] = rmse(output_variables_1, output_variables_2)
            performance_temp[3] = mape(output_variables_1, output_variables_2)
            
            performance_table_Ad_rep.2 = rbind(performance_table_Ad_rep.2, data.table(s, 
                r,k, performance_temp), use.names = FALSE)
            
        }
        
        rm(trainx.2, trainy.2)
    }
}
In [169]:
performance_molten_Ad.2 <- melt(data = performance_table_Ad_rep.2
                             , id.vars = c("seed","rep","new_rep"))
setnames(performance_molten_Ad.2, c("variable","value"),c("errortype","errorvalue"))

Replications on Each Seed

In [170]:
#### Replications on Each Seed ####

Ad_ind_bxp.2 = ggplot(performance_molten_Ad.2, aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_wrap(~seed,ncol=3) +
                        ggtitle(paste0(" test performances","\n","with Ad_WRep Sampling for model_",model.type.2)) 
Ad_ind_bxp.2
In [171]:
bxp.Ad.2 <-  boxplot( data = performance_molten_Ad.2[errortype == "rmse"], errorvalue  ~ seed, col = "pink")
bxp.Ad.2
$stats
A matrix: 5 × 10 of type dbl
7.5563448.0763758.2439877.8368907.2162817.5023277.9381258.4322207.1404167.455889
7.9830558.5310258.5608818.2101207.7698077.7855328.4070328.9175367.5546287.872776
8.1123178.6474068.6918458.3739448.0259907.9260898.5622849.1265927.6969338.115387
8.2742308.8589898.8480328.5517448.2558418.0761858.7869039.2947637.8438538.281212
8.6820249.3463349.2465718.8790208.6735368.3335939.3476809.6974918.1593038.591441
$n
  1. 100
  2. 100
  3. 100
  4. 100
  5. 100
  6. 100
  7. 100
  8. 100
  9. 100
  10. 100
$conf
A matrix: 2 × 10 of type dbl
8.0663128.5955878.6464758.3199677.9491977.8801668.5022659.0669907.6512368.050854
8.1583238.6992248.7372158.4279218.1027847.9720138.6223049.1861947.7426318.179920
$out
  1. 8.72522178981896
  2. 8.94583553684261
  3. 9.29871159351912
  4. 9.15890474214313
  5. 7.3436047566011
  6. 7.32723463249906
$group
  1. 1
  2. 1
  3. 3
  4. 4
  5. 6
  6. 6
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [172]:
performance_table_Ad_rep.2[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<dbl><dbl>
87.683813
57.908511
48.014256
208.093005
08.144915
38.384887
68.595063
18.690371
28.714778
79.123498

Overall BoxPlot

In [173]:
Ad_bxp.2 = ggplot(data = performance_molten_Ad.2, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_Ad_rep.2$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Ad_WRep Sampling for ",model.type.2))
                    
Ad_bxp.2
In [174]:
boxplot.stats(performance_table_Ad_rep.2$rmse)
$stats
  1. 7.14041591478462
  2. 7.94869577709313
  3. 8.30478397443775
  4. 8.67149652715128
  5. 9.69749132658547
$n
1000
$conf
  1. 8.26866996713609
  2. 8.34089798173941
$out

Metamodel Replications On Final Data Ad_range

Model.1

In [175]:
#### Model.1 ####
mtry = 2
ntree = 300
predictedLabels_Ad_range_rep.1 = data.table()
performance_table_Ad_range_rep.1 = data.table(seed = numeric(), rep = numeric(), new_rep = numeric(), 
    mae = numeric(), rmse = numeric(), mape = numeric())

for (s in seed.Ad.1) {
    for (k in 1:10) {
        trainx.1 = FinalTrainData_Ad_range.1[seed == s & rep == k, .SD, .SDcols = feature_names.1]
        trainy.1 = FinalTrainData_Ad_range.1[seed == s & rep == k]$output
        
        test_set.1.retrain = copy(test_set.1)
        
        
        for (r in 1:10) {
            model_Sub.1 <- randomForest(x = trainx.1, y = trainy.1, importance = TRUE, 
                ntree = ntree, mtry = mtry)
            
            predictedLabels.1 <- predict(model_Sub.1, test_set.1.retrain)
            predictedLabels.1 <- cbind(test_set.1.retrain, data.table(pred_output = predictedLabels.1, 
                seed = s, rep = r, new_rep = k))
            
            predictedLabels_Ad_range_rep.1 <- rbind(predictedLabels_Ad_range_rep.1, 
                predictedLabels.1)
            
            output_variables = colnames(select(predictedLabels.1, contains("output")))
            output_variables_1 = predictedLabels.1[, get(output_variables[1]), with = TRUE]
            output_variables_2 = predictedLabels.1[, get(output_variables[2]), with = TRUE]
            
            performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
            performance_temp[1] = mae(output_variables_1, output_variables_2)
            performance_temp[2] = rmse(output_variables_1, output_variables_2)
            performance_temp[3] = mape(output_variables_1, output_variables_2)
            
            performance_table_Ad_range_rep.1 = rbind(performance_table_Ad_range_rep.1, 
                data.table(s, r,k, performance_temp), use.names = FALSE)
            
        }
        
        rm(trainx.1, trainy.1)
    }
}
In [176]:
performance_molten_Ad_range.1 <- melt(data = performance_table_Ad_range_rep.1
                             , id.vars = c("seed","rep","new_rep"))
setnames(performance_molten_Ad_range.1, c("variable","value"),c("errortype","errorvalue"))

Replications on Each Seed

In [177]:
#### Replications on Each Seed ####

Ad_range_ind_bxp.1 = ggplot(performance_molten_Ad_range.1, aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_wrap(~seed,ncol=3) +
                        ggtitle(paste0(" test performances","\n","with Ad_range_WRep Sampling for model_",model.type.1)) 
Ad_range_ind_bxp.1
In [178]:
bxp.Ad_range.1 <-  boxplot( data = performance_molten_Ad_range.1[errortype == "rmse"], errorvalue  ~ seed, col = "pink")
bxp.Ad_range.1
$stats
A matrix: 5 × 10 of type dbl
3.8183933.7489284.4387633.9496065.3938883.3065744.5512933.7415184.2023424.021757
4.0612213.9499304.7014564.2366075.6774573.4618194.8524123.9490744.6774094.217174
4.2404444.0437734.8022994.4089145.8140223.5095864.9948554.1218474.8442764.304086
4.3558274.1646414.9278194.5613795.9556903.5671065.1582634.2616085.0845174.391036
4.6119204.3273595.2649634.8629686.3135213.7077115.5970794.5289405.4213374.579706
$n
  1. 100
  2. 100
  3. 100
  4. 100
  5. 100
  6. 100
  7. 100
  8. 100
  9. 100
  10. 100
$conf
A matrix: 2 × 10 of type dbl
4.1938964.0098494.7665344.3576005.7700613.4929514.9465304.0724664.7799534.276615
4.2869924.0776984.8380654.4602285.8579833.5262225.0431794.1712274.9085994.331556
$out
  1. 4.55407969834722
  2. 4.3001141645151
  3. 3.236020068152
  4. 3.79394615849665
  5. 3.74654949793306
  6. 4.0357621479291
$group
  1. 2
  2. 3
  3. 6
  4. 6
  5. 6
  6. 9
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [179]:
performance_table_Ad_range_rep.1[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<dbl><dbl>
53.510689
14.058241
74.109687
04.223673
204.306827
34.397412
24.809019
84.850239
65.014807
45.830045

Overall BoxPlot

In [180]:
Ad_range_bxp.1 = ggplot(data = performance_molten_Ad_range.1, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_Ad_range_rep.1$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Ad_range_WRep Sampling for ",model.type.1))
                    
Ad_range_bxp.1
In [181]:
boxplot.stats(performance_table_Ad_range_rep.1$rmse)
$stats
  1. 3.236020068152
  2. 4.11119578194699
  3. 4.38708245238811
  4. 4.88726519410391
  5. 6.05064921883661
$n
1000
$conf
  1. 4.34830693034421
  2. 4.42585797443201
$out
  1. 6.19419904483392
  2. 6.06946996581539
  3. 6.12830323427626
  4. 6.15049825680253
  5. 6.18704147668894
  6. 6.13588354334014
  7. 6.31352066780396
  8. 6.07140537102103
  9. 6.1246691137084
  10. 6.07789284275411
  11. 6.18181698364797
  12. 6.08547924001666
  13. 6.0880613724512
  14. 6.10437122566506

Model.2

In [182]:
#### Model.2 ####
mtry = 2
ntree = 300
predictedLabels_Ad_range_rep.2 = data.table()
performance_table_Ad_range_rep.2 = data.table(seed = numeric(), rep = numeric(), new_rep = numeric(), 
    mae = numeric(), rmse = numeric(), mape = numeric())

for (s in seed.Ad.2) {
    for (k in 1:10) {
        trainx.2 = FinalTrainData_Ad_range.2[seed == s & rep == k, .SD, .SDcols = feature_names.2]
        trainy.2 = FinalTrainData_Ad_range.2[seed == s & rep == k]$output
        
        test_set.2.retrain = copy(test_set.2)
        
        
        for (r in 1:10) {
            model_Sub.2 <- randomForest(x = trainx.2, y = trainy.2, importance = TRUE, 
                ntree = ntree, mtry = mtry)
            
            predictedLabels.2 <- predict(model_Sub.2, test_set.2.retrain)
            predictedLabels.2 <- cbind(test_set.2.retrain, data.table(pred_output = predictedLabels.2, 
                seed = s, rep = r, new_rep = k))
            
            predictedLabels_Ad_range_rep.2 <- rbind(predictedLabels_Ad_range_rep.2, 
                predictedLabels.2)
            
            output_variables = colnames(select(predictedLabels.2, contains("output")))
            output_variables_1 = predictedLabels.2[, get(output_variables[1]), with = TRUE]
            output_variables_2 = predictedLabels.2[, get(output_variables[2]), with = TRUE]
            
            performance_temp = matrix(c(1:3), nrow = 1, ncol = 3)
            performance_temp[1] = mae(output_variables_1, output_variables_2)
            performance_temp[2] = rmse(output_variables_1, output_variables_2)
            performance_temp[3] = mape(output_variables_1, output_variables_2)
            
            performance_table_Ad_range_rep.2 = rbind(performance_table_Ad_range_rep.2, 
                data.table(s,r,k,performance_temp), use.names = FALSE)
            
        }
        
        rm(trainx.2, trainy.2)
    }
}
In [183]:
performance_molten_Ad_range.2 <- melt(data = performance_table_Ad_range_rep.2
                             , id.vars = c("seed","rep","new_rep"))
setnames(performance_molten_Ad_range.2, c("variable","value"),c("errortype","errorvalue"))

Replications on Each Seed

In [184]:
#### Replications on Each Seed ####

Ad_range_ind_bxp.2 = ggplot(performance_molten_Ad_range.2, aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_wrap(~seed,ncol=3) +
                        ggtitle(paste0(" test performances","\n","with Ad_range_WRep Sampling for model_",model.type.2)) 
Ad_range_ind_bxp.2
In [185]:
bxp.Ad_range.2 <-  boxplot( data = performance_molten_Ad_range.2[errortype == "rmse"], errorvalue  ~ seed, col = "pink")
bxp.Ad_range.2
$stats
A matrix: 5 × 10 of type dbl
7.764047 9.023495 8.4699507.7548207.5174887.7028818.408855 9.2720737.4537097.724917
8.393300 9.455171 9.1799008.3988718.2329248.1929098.947572 9.7546547.8432848.263263
8.611123 9.651954 9.3694938.5842008.5594668.3821929.134597 9.9263948.0083698.492758
8.864564 9.963729 9.6874138.8503498.8018278.5546399.41871810.1857218.1987548.687359
9.35858010.49898010.1903919.4571289.4572129.0186489.86458210.6663438.6266929.101653
$n
  1. 100
  2. 100
  3. 100
  4. 100
  5. 100
  6. 100
  7. 100
  8. 100
  9. 100
  10. 100
$conf
A matrix: 2 × 10 of type dbl
8.5366639.5716029.2893068.5128668.4695798.3250389.0601569.8582857.9522058.425751
8.6855839.7323069.4496808.6555338.6493528.4393459.2090389.9945028.0645348.559765
$out
  1. 9.02052723271645
  2. 8.67973352323266
  3. 10.9630325277248
  4. 11.0145385008978
$group
  1. 8
  2. 8
  3. 8
  4. 8
$names
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'
  10. '20'
In [186]:
performance_table_Ad_range_rep.2[, .(mean_rmse = mean(rmse)),.(seed)][order(mean_rmse)]
A data.table: 10 × 2
seedmean_rmse
<dbl><dbl>
88.036103
58.377743
208.479383
48.553044
08.625354
38.633567
69.170476
29.419766
19.711648
79.960170

Overall BoxPlot

In [187]:
Ad_range_bxp.2 = ggplot(data = performance_molten_Ad_range.2, aes(y = errorvalue, group=errortype, col=errortype)) +
                    geom_boxplot(aes(colour = errortype)) +
                   # facet_wrap(~ seed) +
                    labs( caption = paste0("meanRMSE : ",mean(performance_table_Ad_range_rep.2$rmse))) +
                    ggtitle(paste0("overall test performace","\n"," with Ad_range_WRep Sampling for ",model.type.2))
                    
Ad_range_bxp.2
In [188]:
boxplot.stats(performance_table_Ad_range_rep.2$rmse)
$stats
  1. 7.45370866663847
  2. 8.36742350151088
  3. 8.79353862359415
  4. 9.40024312948419
  5. 10.6663426845517
$n
1000
$conf
  1. 8.74193483709708
  2. 8.84514241009122
$out
  1. 10.9630325277248
  2. 11.0145385008978

4 Scenarios_Part 2

Results shown here includes:

  • Rd,Ad,Ad_range samplings' boxplots --> 1000 data points
  • oneshot's boxplot --> 100 data points

Model.1

In [191]:
head(performance_molten_Rd.1)
A data.table: 6 × 5
seedrepnew_reperrortypeerrorvalue
<dbl><dbl><dbl><fct><dbl>
011mae2.738111
021mae2.846628
031mae2.807172
041mae2.755153
051mae2.757226
061mae2.726611
In [196]:
all_rep_ind_bxp.1 = ggplot(rbind( data.table(performance_molten_Ad_range.1[,.(errortype,errorvalue)], sampling = "4Ad_range")
                                 ,data.table(performance_molten_Ad.1[,.(errortype,errorvalue)], sampling = "3Ad")
                                 ,data.table(performance_molten_Rd.1[,.(errortype,errorvalue)], sampling = "1Rd")
                                 ,data.table(performance_molten_oneshot.1[,.(errortype,errorvalue)], sampling = "2oneshot"))    
                           , aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_grid(~sampling) +
                        ggtitle(paste0(" test performances after replication","\n","with 4 Samplings for model_",model.type.1)) 
all_rep_ind_bxp.1
In [197]:
grid.arrange(all_range_bxp.1,all_rep_ind_bxp.1)

Model.2

In [198]:
all_rep_ind_bxp.2 = ggplot(rbind( data.table(performance_molten_Ad_range.2[,.(errortype,errorvalue)], sampling = "4Ad_range")
                                ,data.table(performance_molten_Ad.2[,.(errortype,errorvalue)], sampling = "3Ad")
                                 ,data.table(performance_molten_Rd.2[,.(errortype,errorvalue)], sampling = "1Rd")
                                 ,data.table(performance_molten_oneshot.2[,.(errortype,errorvalue)], sampling = "2oneshot"))    
                           , aes(y = errorvalue, group=errortype, col=errortype)) + 
                        geom_boxplot(varwidth = TRUE)+
                        facet_grid(~sampling) +
                        ggtitle(paste0(" test performances after replication","\n","with 4 Samplings for model_",model.type.2)) 
all_rep_ind_bxp.2
In [199]:
grid.arrange(all_range_bxp.2,all_rep_ind_bxp.2)